from bs4 import BeautifulSoup
import pandas
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import math
from datetime import datetime


# Search terms; one browser session is opened per term.
keywords = ["вакцина"]

# Name of the CSV file the collected links are written to.
myfile = "LINKS_ukraina_ru_vaccine.csv"

# Seconds to pause after each press of the "load more" button.
# Never set this below 1, and only raise it above 1 when the script prints
# a message asking you to do so.
wait_time = 1

# Inclusive date range of interest, given as year, month, day.
my_start_date = datetime(year=2020, month=11, day=1).date()
my_end_date = datetime(year=2021, month=11, day=30).date()

# Site root that is prepended to the relative links found in the listing.
base_url = "https://ukraina.ru"

# Accumulators filled by the scraping loop: every link seen, and the subset
# whose article date falls inside the range above.
total_url_list = []
correct_url_list = []

def _parse_article_date(raw_text):
    """Convert the text of an article's <time> tag into a ``date``.

    The listing shows dates as ``dd.mm.YYYY`` (capital Y because the year has
    four digits). Text that is not in that format is treated as today's date:
    per the original author's note, an article published today shows only a
    time of day instead of a date.
    """
    try:
        return datetime.strptime(raw_text.strip(), "%d.%m.%Y").date()
    except ValueError:
        return datetime.now().date()


# CSS selector for the "load more" button. Selenium cannot take a
# space-separated class list, so the classes are chained with periods:
# https://stackoverflow.com/questions/65395557/selenium-find-element-by-class-name-and-find-element-by-css-selector-not-working
_LOAD_MORE_SELECTOR = ".input-button.b-btn.m-more"

# Attribute filter matching one search result entry in the listing.
_ARTICLE_ATTRS = {"class": "rubric-list__article"}

for keyword in keywords:
    search_url = "https://ukraina.ru/search/?query=" + keyword

    # Silences the "handshake failed" warnings, see:
    # https://stackoverflow.com/questions/37883759/errorssl-client-socket-openssl-cc1158-handshake-failed-with-chromedriver-chr
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(options=options)

    # try/finally guarantees the browser is closed even if scraping fails
    # part-way through; otherwise every failed run leaves a Chrome process behind.
    try:
        driver.get(search_url)

        # Find the number of articles so we know how many times to press the button.
        initial_soup = BeautifulSoup(driver.page_source, "html.parser")
        num_articles = initial_soup.find("div", {"class": "search__results-count"}).text.split(" ")[-1]
        print("for the keyword " + keyword + ", the number of articles for all time is: " + str(num_articles))

        # Round UP so the last partial batch of results is reached as well.
        cycles = math.ceil(int(num_articles) / 10)
        print("The maximum number of times clicking load more is: " + str(cycles))

        for i in range(cycles):
            print("We are on cycle: " + str(i))

            # find_elements returns an empty list instead of raising when the
            # button is absent, so running out of pages ends the loop cleanly
            # and the links loaded so far are still collected below.
            load_more_buttons = driver.find_elements(By.CSS_SELECTOR, _LOAD_MORE_SELECTOR)
            if not load_more_buttons:
                print("No load more button found, stopping early.")
                break

            # click() did not work on this element, but sending "enter" (\n) does.
            load_more_buttons[0].send_keys("\n")

            # Give the page time to load the new results after the press.
            # On a slow connection wait_time needs to be increased.
            time.sleep(wait_time)

            # Stop pressing once the last loaded article is older than the start date.
            scrolling_soup = BeautifulSoup(driver.page_source, "html.parser")
            loaded_articles = scrolling_soup.find_all("article", _ARTICLE_ATTRS)
            if not loaded_articles:
                # Nothing rendered yet; try the next cycle instead of crashing on [-1].
                continue
            oldest_loaded_date = _parse_article_date(loaded_articles[-1].find("time").text)
            if my_start_date > oldest_loaded_date:
                break

        # Now parse the fully expanded page and collect the links.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for url_messy in soup.find_all("article", _ARTICLE_ATTRS):
            url = base_url + url_messy.find("a")["href"]
            total_url_list.append(url)
            article_date = _parse_article_date(url_messy.find("time").text)
            if my_start_date <= article_date <= my_end_date:
                correct_url_list.append(url)
    finally:
        driver.quit()



# Sanity check: ending up with fewer than 11 links in total is taken as a sign
# that the "load more" presses did not finish loading within wait_time.
# NOTE(review): a search that genuinely has fewer than 11 results would also
# trip this check - confirm that cannot happen for the configured keywords.
if len(total_url_list) < 11:
    print("Your internet is slow. Please increase wait_time variable and re-run the code until you no longer see this message.")
    # quit() is an interactive-shell helper added by the site module and is not
    # available under `python -S` or in frozen builds; raising SystemExit is the
    # portable equivalent and exits with the same status.
    raise SystemExit

print("Before removing duplicates, the number of articles in the correct date range is: " + str(len(correct_url_list)))

# The same article can be collected more than once (for example under several
# keywords). dict.fromkeys keeps the first occurrence of each link in its
# original order and runs in linear time, unlike repeated list membership tests.
unique_url_list = list(dict.fromkeys(correct_url_list))

print("After removing duplicates we have: " + str(len(unique_url_list)))


# Put the links into a one-column DataFrame so the CSV column carries a label.
# Naming the column explicitly keeps the "url" header in the output even when
# no links were collected; building the frame from an empty list of dicts
# would produce a file with no column at all.
df = pandas.DataFrame({"url": unique_url_list})

# utf-8-sig writes a byte order mark at the start of the file.
df.to_csv(myfile, encoding="utf-8-sig")






